In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
# Load the dataset and discard the 'Id' column in a single chained expression.
iris = pd.read_csv("Iris.csv").drop(columns='Id')

# Every column except the last one is the feature matrix handed to the clustering step.
X = iris.iloc[:, :-1].values
# The last column is kept for visualization only; clustering itself does not need labels.
y = iris.iloc[:, -1].values

# Preview the first rows, shaded with a cubehelix colour map.
iris.head().style.background_gradient(cmap=sns.cubehelix_palette(as_cmap=True))
Out[1]:
| SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|
| 0 | 5.100000 | 3.500000 | 1.400000 | 0.200000 | Iris-setosa |
| 1 | 4.900000 | 3.000000 | 1.400000 | 0.200000 | Iris-setosa |
| 2 | 4.700000 | 3.200000 | 1.300000 | 0.200000 | Iris-setosa |
| 3 | 4.600000 | 3.100000 | 1.500000 | 0.200000 | Iris-setosa |
| 4 | 5.000000 | 3.600000 | 1.400000 | 0.200000 | Iris-setosa |
In [2]:
# Colour sequences: the box plots use a darker first colour than the other charts.
main_palette = ['#491D8B','#7D3AC1','#EB548C']
box_palette = ['#29066B','#7D3AC1','#EB548C']

# Share of rows per species (the author notes the data is perfectly balanced).
fig = px.pie(iris, names='Species', color_discrete_sequence=main_palette,
             title='Data Distribution', template='plotly')
fig.show()

# Arguments shared by both per-species box plots; only the y column differs.
box_kwargs = dict(data_frame=iris, x='Species', color='Species',
                  color_discrete_sequence=box_palette, orientation='v')

# Sepal length per species as a box plot.
fig = px.box(y='SepalLengthCm', **box_kwargs)
fig.show()

# Sepal length as a histogram, coloured by species.
fig = px.histogram(data_frame=iris, x='SepalLengthCm', color='Species',
                   color_discrete_sequence=main_palette, nbins=50)
fig.show()

# Sepal width per species as a box plot.
fig = px.box(y='SepalWidthCm', **box_kwargs)
fig.show()

# Pairwise scatter plots: sepal length vs. sepal width (marker size = petal length),
# then petal length vs. petal width (marker size = sepal length).
scatter_specs = [
    ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm'),
    ('PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm'),
]
for x_col, y_col, size_col in scatter_specs:
    fig = px.scatter(data_frame=iris, x=x_col, y=y_col, color='Species', size=size_col,
                     template='seaborn', color_discrete_sequence=main_palette)
    fig.update_layout(width=800, height=600,
                      xaxis=dict(color="#BF40BF"), yaxis=dict(color="#BF40BF"))
    fig.show()
In [3]:
# Elbow method: fit K-Means for k = 1..8 and record the inertia (SSE) of each fit.
cluster_counts = list(range(1, 9))
sse = []
for k in cluster_counts:
    # Use the same init / n_init / random_state as the final model so the curve is
    # reproducible across runs and does not depend on the library's n_init default.
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300,
                    n_init=10, random_state=0)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

# Plot SSE against the actual number of clusters; without an explicit x the line
# would be drawn against the 0-based list index and the elbow would appear one
# position to the left of its true k.
fig = px.line(x=cluster_counts, y=sse, template="seaborn", title='Elbow Method')
fig.update_layout(width=800, height=600,
                  title_font_color="#BF40BF",
                  xaxis=dict(color="#BF40BF", title="Clusters"),
                  yaxis=dict(color="#BF40BF", title="SSE"))
Số cluster tối ưu = 3
In [4]:
# Final model, using the three clusters chosen from the elbow plot.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',   # k-means++ centroid seeding
    max_iter=300,       # iteration cap per run
    n_init=10,          # number of initialisations tried
    random_state=0,     # fixed seed so the labelling is repeatable
)

# Fit on the feature matrix and keep the cluster index assigned to each row.
clusters = kmeans.fit_predict(X)
In [5]:
# Scatter the first two feature columns, one trace per K-Means cluster.
fig = go.Figure()

# K-Means cluster indices are arbitrary, so index 0/1/2 cannot be assumed to mean
# setosa/versicolor/virginica. Each trace is named after the species that is most
# frequent among the rows assigned to that cluster (looked up in `y`).
cluster_colors = ['#DB4CB2', '#c9e9f6', '#7D3AC1']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = clusters == cluster_id
    members = y[in_cluster]
    trace_name = f'Cluster {cluster_id}'
    if members.size:  # guard: an empty cluster has no majority species
        trace_name += f' (mostly {pd.Series(members).mode().iloc[0]})'
    fig.add_trace(go.Scatter(x=X[in_cluster, 0], y=X[in_cluster, 1],
                             mode='markers', marker_color=color, name=trace_name))

# Cluster centres projected onto the same two feature columns.
fig.add_trace(go.Scatter(x=kmeans.cluster_centers_[:, 0], y=kmeans.cluster_centers_[:, 1],
                         mode='markers', marker_color='#CAC9CD', marker_symbol=4,
                         marker_size=13, name='Centroids'))

# Axis titles come from the DataFrame columns that X[:, 0] and X[:, 1] were sliced from.
fig.update_layout(template='plotly_dark', width=1000, height=500,
                  title='Kmean Clustering Results',
                  xaxis_title=iris.columns[0], yaxis_title=iris.columns[1])